library("here")
library("tidyverse")07_analysis3
01_load
Libraries
Download data
The data and _raw directories are only created if they do not already exist.
source("99_proj_func.R")
create_directory(here("data"))
create_directory(here("data", "_raw"))if(!file.exists(here("data", "_raw", "GSE41080_RAW.tar"))){
curl::curl_download(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE41080&format=file",
destfile = here("data", "_raw", "GSE41080_RAW.tar"))
}Expression
untar(here("data", "_raw", "GSE41080_RAW.tar"),
exdir = here("data", "expression"))expression_files <- list.files(here("data", "expression"))The first file in expression_files is a reference file for the probe IDs in the other files. We do not use this file and as a result it is not read. The column names are added manually and the first line skipped because column names in the files contain the general column name plus a number that is different.
data_expression <- read_delim(here("data", "expression", expression_files[2:92]),
id = "file",
col_names = c("PROBE_ID",
"NBEADS1",
"Signal",
"Pval",
"NARRAYS",
"ARRAY_STDEV",
"BEAD_STDERR",
"NBEADS2",
"SEARCH_KEY"),
skip = 1,
na = "NaN") |>
mutate(file = str_extract(file, "GSM[0-9]+"))write_tsv(data_expression, here("data", "01_expression_dat_load.tsv.gz"))Meta data
The rows after row 89 contain no information and are as a result not included
metadata <- read_delim(here("data", "_raw", "msb201315-s2.csv"),
delim = ",",
na = "NA" ,
n_max= 89)write_tsv(metadata, here("data", "01_meta_dat_load.tsv.gz"))02_clean
Libraries
library("here")
library("tidyverse")Expression data
For the analysis only the probe ID and signal is needed. The p-value is used to choose which probes to work with. The file identifier is needed to combine the expression data and the metadata. Two people did not complete the trial and as a result there is no metadata for them so their expression data is removed.
data_expression <- read_tsv(here("data", "01_expression_dat_load.tsv.gz")) |>
select(file, PROBE_ID, Signal, Pval) |>
filter(file != "GSM1008315" & file != "GSM1008340")For the analysis only probes with an average p-value under 0.05 are used
data_expression <- data_expression |>
group_by(PROBE_ID) |>
nest() |>
mutate(Avg_Pval = map_dbl(.x = data,
.f = ~mean(pull(.x, Pval)))) |>
unnest(cols = c(data)) |>
ungroup() |>
filter(Avg_Pval < 0.05) |>
select(-Avg_Pval)write_tsv(data_expression, here("data", "02_expression_dat_clean.tsv.gz"))Metadata
metadata <- read_tsv(here("data", "01_meta_dat_load.tsv.gz"))
metadata <- metadata |>
select(-starts_with("Season"),
-starts_with("VacType"),
-Prior_vac,
-VACCOUNT,
-FLU_EVR,
-FLU_HOSP)
write_tsv(metadata, here("data", "02_meta_dat_clean.tsv"))03_augment
library("tidyverse")
library("here")Load the data from the final files.
metadata <- read_tsv(here("data", "02_meta_dat_clean.tsv"))
analysis_data <- read_tsv(here("data", "02_expression_dat_clean.tsv.gz"))In order to create the final dataset, we need to pivot wider the data with mRNA expression using as row names the sample IDs, column names will be the Probe IDs and the values are the signals for each Probe.
analysis_data <- analysis_data |>
pivot_wider(id_cols = file,
names_from = PROBE_ID,
values_from = Signal) We, also, need to connect the two datasets by the IDs.
mapping_key <- analysis_data |>
select(file) |>
distinct()
metadata <- metadata |>
mutate(mapping_key) |>
select(-xID)Now we can connect the two datasets using as key to join the mapping_key created.
analysis_data <- metadata |>
full_join(analysis_data, by = "file") |>
relocate("file")
analysis_data <- analysis_data |>
rename(EBV_pos = "EBV (1 = pos)",
CMV_pos = "CMV (1= pos)")
analysis_data <- analysis_data |>
mutate(
Age_Group = case_when(
Age >= 20 & Age <= 30 ~ "Young",
Age >= 61 ~ "Older"),
Cytomegalovirus = case_when(
CMV_pos == 1 ~ "Positive",
CMV_pos == 0 ~ "Negative"),
EpsteinBarrvirus = case_when(
EBV_pos == 1 ~ "Positive",
EBV_pos == 0 ~ "Negative")) |>
relocate(Age_Group, .after = Age) |>
relocate(Cytomegalovirus, .after = BMI) |>
relocate(EpsteinBarrvirus, .after = BMI ) |>
select(-`CMV_pos`, -EBV_pos) The code checks whether each person had a 4-fold antibody increase for the three flu strains, counts how many strains they responded to, and labels them as poor or good responders. It then reorganizes these new columns and summarizes response rates by age group and number of strains for plotting.
analysis_data <- analysis_data |>
mutate(
H1N1_fold = (H1N1v3 / H1N1v1) >= 4,
B_fold = (Bv3 / Bv1) >= 4,
H3N2_fold = (H3N2v3 / H3N2v1) >= 4,
Strains = H1N1_fold + B_fold + H3N2_fold,
Response_group = if_else(Strains <= 1, "Poor responders", "Good responders")
)
analysis_data <- analysis_data |>
relocate(H1N1_fold, B_fold, H3N2_fold, Strains, Response_group,
.after = Cytomegalovirus)write_tsv(analysis_data, here("data", "03_expression_dat_aug.tsv.gz"))
rm(list = ls())04_describe
Loading the libraries we will need
library("table1")
library("tidyverse")
library("here")
source("99_proj_func.R")
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)Loading the dataset we are going to analyse.
analysis_data <- read_tsv(here("data", "03_expression_dat_aug.tsv.gz"))To describe the dataset, we are making two tables. The first table shows the average age and BMI of the two groups, Old and young, as well as the range of each measure in both groups. The second table shows the distribution of each group in the three variables: Gender, Cytomegalovirus and Epstein-Barr virus (positive/negative).
numeric_summary <- analysis_data |>
group_by(Age_Group) |>
summarize(Age_Median_Range = str_c(
round(median(Age, na.rm = TRUE), 1),
" (",
min(Age, na.rm = TRUE),
"–",
max(Age, na.rm = TRUE),
")"),
BMI_Median_Range = str_c(
round(median(BMI, na.rm = TRUE), 1),
" (",
min(BMI, na.rm = TRUE),
"–",
max(BMI, na.rm = TRUE),
")"),
.groups = 'drop')
gender_table <- get_categorical_summary(analysis_data, Gender) |>
mutate(Variable = "Gender")
cmv_table <- get_categorical_summary(analysis_data, Cytomegalovirus) |>
mutate(Variable = "Cytomegalovirus")
ebv_table <- get_categorical_summary(analysis_data, EpsteinBarrvirus) |>
mutate(Variable = "EpsteinBarrvirus")
categorical_summary <- bind_rows(gender_table, cmv_table, ebv_table)
final_categorical_table <- categorical_summary |>
pivot_wider(names_from = Age_Group,
values_from = Value,
id_cols = c(Variable, Characteristic),
names_sort = TRUE)
print(numeric_summary)# A tibble: 2 × 3
Age_Group Age_Median_Range BMI_Median_Range
<chr> <chr> <chr>
1 Older 78 (61–93) 25.1 (18–47.3)
2 Young 24 (20–30) 22.9 (18.8–43.6)
print(final_categorical_table)# A tibble: 6 × 4
Variable Characteristic Older Young
<chr> <chr> <chr> <chr>
1 Gender Female 40 (67%) 14 (48%)
2 Gender Male 20 (33%) 15 (52%)
3 Cytomegalovirus Negative 24 (40%) 13 (45%)
4 Cytomegalovirus Positive 36 (60%) 16 (55%)
5 EpsteinBarrvirus Negative 19 (32%) 13 (45%)
6 EpsteinBarrvirus Positive 41 (68%) 16 (55%)
05_analysis1
Libraries
library("tidyverse")
library("here")
library("broom")
library("patchwork")
library("ggridges")
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)PCA
A PCA on just the signals from the probes is done to see if it can show a connection between the signals pre vaccine and the response to the vaccine. The PCA is done with inspiration from “PCA tidyverse style” by Claus O. Wilke
analysis_data <- read_tsv(here("data", "03_expression_dat_aug.tsv.gz"))pca <- analysis_data |>
select(starts_with("ILMN_")) |>
scale() |>
prcomp()Variance explained by principal components
pca_var <- pca |>
tidy("eigenvalues")
p1 <- pca_var |>
ggplot(aes(x = PC,
y = percent,
fill = "#ED6F58",
color = "#ED6F58")) +
geom_col() +
labs(title = "Individual",
subtitle = "All PCs",
y = "Explained Variance") +
theme_minimal() +
theme(legend.position = "none")
p2 <- pca_var |>
filter(PC < 6) |>
ggplot(aes(x = PC,
y = percent,
fill = "#ED6F58",
color = "#ED6F58")) +
geom_col() +
labs(title = "Individual",
subtitle = "5 first PCs",
y = "Explained Variance") +
theme_minimal() +
theme(legend.position = "none")
p3 <- pca_var |>
ggplot(aes(x = PC,
y = cumulative)) +
geom_line() +
geom_point(color = "#ED6F58") +
labs(title = "Cumulative",
y = "Explained Variance") +
scale_y_continuous(breaks = seq(from = 0.4,
to = 1,
by = 0.1)) +
theme_minimal() +
theme(legend.position = "none")
p1 / (p2 + p3)ggsave(filename = "05_variance_plot.png",
path = here("results"))The cumulative plot shows that the first 5 principal components explain over 70% of the variance. As a result they are examined to see if they can be used to separate good and poor responders.
Data in principal component coordinates
First the data in the coordinates for the 5 first principal components is split into good or poor response to find the 2 principal components that are most likely to separate them.
pca |>
augment(analysis_data) |>
pivot_longer(cols = starts_with(".fitted"),
names_to = "PC",
values_to = "Coordinates") |>
mutate(PC = as.numeric(str_extract(PC,
"[0-9]+"))) |>
filter(PC < 6) |>
mutate(PC = factor(PC)) |>
ggplot(aes(x = Coordinates,
y = PC,
fill = Response_group,
color = Response_group)) +
geom_density_ridges(alpha = 0.5,
scale = 1.2) +
scale_fill_manual(values = c("#E69F00", "#0072B2")) +
scale_color_manual(values = c("#E69F00", "#0072B2")) +
theme_minimal() +
theme(legend.position = "bottom",
legend.title = element_blank())ggsave(filename = "05_1d_plot.png",
path = here("results"))The plot shows that none of the 5 first principal componets separate the good and poor responders well. Principal component 2 and 3 are chosen as they show most separation and maybe together can separate the type of response better.
pca |>
augment(analysis_data) |>
rename_with(.cols = starts_with(".fitted"),
.fn = ~str_extract(.x,
"PC[0-9]+")) |>
ggplot(aes(x = PC2,
y = PC3,
color = Response_group)) +
geom_point(size = 3) +
scale_color_manual(values = c("#E69F00", "#0072B2")) +
theme_minimal() +
theme(legend.title = element_blank(),
legend.box.background = element_rect(color = "lightgrey"))ggsave(filename = "05_2d_plot.png",
path = here("results"))The plot shows no separation so the PCA does not show a connection between the signals pre vaccine and the response to it.
06_analysis2
To examine the connection between poor or good response to the vaccine and some of the metadata the figures in figure 2 in the paper are recreated
Plot A:
library("tidyverse")
library("here")
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)analysis_data <- read_tsv(here("data", "03_expression_dat_aug.tsv.gz"))plot_data <- analysis_data |>
group_by(Age_Group, Strains) |>
summarise(n = n(), .groups = "drop") |>
group_by(Age_Group) |>
mutate(rate = 100 * n / sum(n)) |>
ungroup() |>
mutate(Strains = factor(Strains, levels = 0:3))ggplot(plot_data,
aes(x = as.numeric(as.character(Strains)),
y = rate,
fill = Age_Group)) +
geom_col(position = "dodge", colour = "black", width = 0.7) +
scale_x_continuous(breaks = 0:3, labels = 0:3) +
scale_fill_manual(values = c("Older" = "white", "Young" = "black")) +
geom_vline(xintercept = 1.5, linetype = "dashed") +
labs(x = "Number of strains",
y = "Seroconversion rate (%)",
fill = NULL) +
theme_bw() +
theme(panel.grid = element_blank())If only 0 or 1 of the strains had a 4-fold antibody increase they were labeled poor responders (left of the dotted line). The good responders were those where 2 or 3 of the strains had a 4-fold antibody increase (right of the dotted line).
ggsave(filename = "06_A_plot.png",
path = here("results"))Plot B:
plot_b_data <- analysis_data |>
count(Age_Group, Response_group) |>
group_by(Age_Group) |>
mutate(percent = 100 * n / sum(n)) |>
ungroup() |>
mutate(
Age_Group = factor(Age_Group, levels = c("Young", "Older")),
Response_group = factor(Response_group, levels = c("Poor responders", "Good responders"))
)
ggplot(plot_b_data, aes(y = Age_Group, x = percent, fill = Response_group)) +
geom_col(position = "fill", colour = "black", width = 0.6) +
scale_x_continuous(labels = scales::percent_format()) +
scale_fill_manual(
values = c("Poor responders"="white", "Good responders" = "black"),
labels = c("PR", "GR")
) +
labs(x = "Percent response", y = NULL, fill = NULL) +
theme_bw() +
theme(panel.grid = element_blank())There is larger percent of the young age group that are good responders than in the older age group. The color of the plot makes it difficult to see that the white is also a group so it is improved with color.
ggsave(filename = "06_B1_plot.png",
path = here("results"))Improved version of plot B:
plot_b_data <- analysis_data |>
count(Age_Group, Response_group) |>
group_by(Age_Group) |>
mutate(percent = 100 * n / sum(n)) |>
ungroup() |>
mutate(
Age_Group = factor(Age_Group, levels = c("Young", "Older")),
Response_group = factor(Response_group, levels = c("Poor responders", "Good responders"))
)
ggplot(plot_b_data, aes(y = Age_Group, x = percent, fill = Response_group)) +
geom_col(position = "fill", colour = "black", width = 0.6) +
scale_x_continuous(labels = scales::percent_format()) +
scale_fill_manual(
values = c("Poor responders"="#0072B2", "Good responders" = "#E69F00"),
labels = c("PR", "GR")
) +
labs(x = "Percent response", y = NULL, fill = NULL) +
theme_bw() +
theme(panel.grid = element_blank())ggsave(filename = "06_B2_plot.png",
path = here("results"))Plot C:
plot_c_data <- analysis_data |>
mutate(pre_GMT = exp(rowMeans(log(cbind(H1N1v1, Bv1, H3N2v1)))),
log_pre_GMT = log(pre_GMT))
ggplot(plot_c_data, aes(x = Age_Group, y = log_pre_GMT)) +
geom_boxplot(fill = "white", color = "black") +
labs(x = NULL, y = "log(pre-GMT)") +
theme_bw() +
theme(panel.grid = element_blank(),
legend.position = "none")The geometric mean titer pre vaccine is higher in the young age group than in the older
ggsave(filename = "06_C_plot.png",
path = here("results"))07_analysis3
Load libraries and data
library("tidyverse")
library("here")
library("purrr")
library("broom")
library("ggrepel")
source("99_proj_func.R")
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
analysis_data <- read_tsv(here("data", "03_expression_dat_aug.tsv.gz"))In this analysis, we are going to use the Probes signal prior to vaccination from all candidates to find possible biomarkers that predict the response to the vaccine.
First step, is to make a long table of our data
analysis_data_long <- analysis_data |>
select(file, Response_group, starts_with("ILMN_")) |>
pivot_longer(cols = !c(file, Response_group),
names_to = "PROBE_ID",
values_to = "log2_signal")Then, we apply T-test to the signal data for each probe comparing the Good responders to the Poor responders. Since we have the Fold Change between the 2 groups, we can now apply the tidy() formula to calculate the Pvalue, and from that the Pvalue adjusted.
analysis_data_final <- analysis_data_long |>
group_by(PROBE_ID) |>
nest() |>
mutate(
ttest_results = map(data, ~ t.test(log2_signal ~ Response_group, data = .x, var.equal = FALSE)),
tidy_results = map(ttest_results, tidy)) |>
unnest(tidy_results) |>
ungroup() |>
mutate(p.adj = p.adjust(p.value, method = "BH")) |>
mutate(log2_FC = estimate1 - estimate2 ) |>
select(PROBE_ID, log2_FC, p.value, p.adj) |>
arrange(p.adj)Let’s visualize our results with a volcano plot.
analysis_data_final |>
ggplot(aes(x = log2_FC, y = -log10(p.adj))) +
geom_point(aes(alpha = 0.05), color = "salmon", position="dodge") +
geom_text_repel(
data = analysis_data_final |>
filter(p.adj < 0.03, abs(log2_FC) > 1.5),
aes(label = PROBE_ID),
size = 2.5,
color = "turquoise3",
max.overlaps = 50,
nudge_y = 0.1) +
scale_y_continuous(limits = c(0, max(-log10(analysis_data_final |> pull(p.adj)), na.rm = TRUE) * 1.25)) +
labs(
title = "Probes predicting the vaccine response",
subtitle = "Probes highlighted in turquoise were significant after multiple testing correction",
x = expression(log[2]~"Fold Change (Good / Poor Responder)"),
y = expression(-log[10]~"Adjusted p-value (FDR)"),
caption = "Data from GSE41080") +
theme_minimal() +
theme(legend.position = "none")ggsave(filename = "07_volcano_plot.png",
path = here("results"))Warning: Width not defined
ℹ Set with `position_dodge(width = ...)`
Warning: ggrepel: 251 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
We observe that probes ILMN_1688780 and ILMN_1739792 are the most differentiated probes, based on the visible distance from the center and high -log10(p.adj) value. In other words, these probes showed significantly lower baseline expression in candidates that proved to be good responders. To find the best predictor we will use a boxplot.
analysis_data_long |>
filter(PROBE_ID == c("ILMN_1688780", "ILMN_1739792")) |>
ggplot(aes(x= Response_group, y = log2_signal, fill = Response_group)) +
geom_violin(trim = FALSE, alpha = 0.5) +
geom_boxplot(width = 0.15, fill = "white", outlier.shape = NA) +
geom_jitter(shape = 16, position = position_jitter(0.2), size = 2, alpha = 0.7) +
facet_wrap(~ PROBE_ID, scales = "free_y", ncol = 4) +
coord_cartesian(ylim = c(min(analysis_data_long |> pull(log2_signal), na.rm = TRUE) * 1.1,
max(analysis_data_long |> pull(log2_signal), na.rm = TRUE) * 1.05)) +
labs(
title ="Baseline Expression of possible Predictors:\nILMN_1688780 and ILMN_1739792",
subtitle = "Lower pre-vaccine expression predicts a Good Response",
x = "Vaccine Antibody Response Group",
y = expression(log[2]~"Signal (Pre-Vaccine Expression)")
) +
scale_fill_manual(values = c("Good responders" = "turquoise3", "Poor responders" = "salmon")) +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))ggsave(filename = "07_boxplot.png",
path = here("results"))Based on the final plot, we observe that the most significant differentiation between the two response groups comes from probe ILMN_1688780.